# Import libraries
import math
import numpy as np
from dataclasses import dataclass, field
from typing import List, Tuple
from pprint import pprint
import matplotlib.pyplot as pltSynthetic Clinical Trial for Down’s Syndrome
Given measured device performace metrics (such as sensitivity, selectivity, storage and sample processing variations), estimate the ROC curve. In other given some device variation data, generate the AUC value that would be achieved by the test in clinic?
In other words, simulate an observational clinical trial for Down’s syndrome.
Inputs:
Device Caliberation Curve with Error Bars, Storage and processing variation data with Error Bars
Outputs:
ROC Curve and AUC valuePlan
Here is some sample code that we would eventually wish to write:
theoratical_max_roc, theoratical_max_auc = device_model()
roc, auc = device_model(input_file="./raw_data.csv")
plot_roc(roc)
print("AUC based on current device = ", auc)
dr = detection_rate(roc, fpr=0.05)
print("Detection rate achieved at 5% FPR is ", dr)Proposed csv format for Device Caliberation Curve
concentration,mean,std
-8.32,23.33,3.1
...
Proposed csv format for Storage and Processing Variation data
marker,elapsed_days,mean,std
bhcg,8,23.33,3.1
bhcg,10,24.53,4.2
...
Process
- ✅ Produce a model of population distribution for down syndrome
- Define probability distribution of samples
- Generate lots of samples based on the probability distribution (Age, b-hcg, papp-a, NT scan)
- Model and simulate storage and processing errors
- Model and simulate sensitivity and selectivity of our sensor
- ✅ Define a diagnostic algorithm, generate RUC curve and AUC value
Example 1. P(H) = 0.5, P(T) = 0.5 : Probability Distribution : Normal Distribution ( Average(X), SD(X) ) / Log Normal Distribution ( Average(log(X)), SD(log(X) ) 2. Random Sampling (take my distribution) -> then produce a random sample
Note
MoM: Multiples of Median is used as a gestational age normalized form of the raw values from different markers.
Modeling of population distribution for down syndrome
# Some helper lambdas to make switching bases easier to log
# Log base 10
log = np.log10
inv_log = lambda x: np.pow(10, x)
# Natural Logs
# lg = np.log
# inv_lg = lambda x: np.exp(x)@dataclass
class Marker:
"""This class is used to define a particular marker. Example, b-hCG."""
name: str # Name of the marker, example: b-hCG
median_mom_down: float # median MoM for down syndrome patients
median_mom_control: float # median MoM for control patients
log_sd_down: float = 0.0 # log of the sd of the marker's MoM values in Down Syndrome cases
log_sd_control: float = 0.0 # log of the sd of the marker's MoM values in Control cases@dataclass
class PopulationConfig:
"""
Maternal Age is Log Normally distributed.
"""
maternal_age_mean: float = 27.0 # Mean/Average of maternal age in years
maternal_age_sd: float = 5.5 # SD of maternal age in years
down_syndrome_prevalence: float = 1/700 # Prevalence of Down syndrome in the population# Create the configuration for Population with default values
pop_config = PopulationConfig()
pprint(pop_config)PopulationConfig(maternal_age_mean=27.0,
maternal_age_sd=5.5,
down_syndrome_prevalence=0.0014285714285714286)
# Define the list of markers
markers: list[Marker] = [
Marker(name="Free B-hCG", median_mom_down=1.70, median_mom_control=1.01, log_sd_down=0.28, log_sd_control=0.27),
Marker(name="PAPP-A", median_mom_down=0.49, median_mom_control=1.00, log_sd_down=0.31, log_sd_control=0.25),
Marker(name="NT", median_mom_down=1.74, median_mom_control=1.01, log_sd_down=0.23, log_sd_control=0.13),
]
markers_count = len(markers)Generate lots of samples based on the probability distribution
sample_size: int = 1_00_000 # Number of samples to be generated from the distribution# If needed, we can make make the random functions behave the same every time by choosing a seed value
np.random.seed(42)# Randomly (normally) assign down syndrome to patients based on down syndrome prevalence (usually 1/700)
has_down = np.random.random(sample_size) < pop_config.down_syndrome_prevalence
has_downarray([False, False, False, ..., False, False, False])
sampled_prevalence = (np.sum(has_down) / sample_size)
print(f"We get the prevalence of {sampled_prevalence*100:.2f}% which is similar to the expected value of {1/7:.2f}%")
assert np.allclose(sampled_prevalence, pop_config.down_syndrome_prevalence, atol=1/2000), "Sampled prevalence should be similar to expected prevalence"We get the prevalence of 0.14% which is similar to the expected value of 0.14%
min_max = lambda v, tol: ((v - tol) * 100, (v + tol) * 100)
min_max(0.0014, 1/2000)(0.09, 0.19)
Let X be a random variable with a log normal distribution \(N(\mu_X, \sigma^2_X\)). Then the \(ln(X)\) has the mean \(\mu\) and variance \(\sigma^2\).
\[ \begin{align} \mu &= ln({\frac{\mu^2_X}{\sqrt{\mu^2_X + \sigma^2_X}}}) \\ \sigma^2 &= ln(1 + \frac{\sigma^2_X}{\mu^2_X}) \end{align} \]
mu = log(pop_config.maternal_age_mean**2 / (np.sqrt(pop_config.maternal_age_mean**2 + pop_config.maternal_age_sd**2)))
sig2 = log(1 + (pop_config.maternal_age_sd ** 2 / pop_config.maternal_age_mean ** 2))
sig = np.sqrt(sig2)
log_maternal_ages = np.random.normal(
mu,
sig,
sample_size
)
print(
mu, sig
)
log_maternal_ages1.4225351280228231 0.13288066929515427
array([1.53451514, 1.75791092, 1.40922959, ..., 1.46250808, 1.57402931,
1.56990077])
def summary_stats(data: np.ndarray):
print(f"X ~ N(μ, σ^2): N({np.mean(data):.4f}, {np.std(data):.4f})")
print(f"Range: {np.min(data):.4f} ≤ X ≤ {np.max(data):.4f}")
print(f"median (M): {np.median(data):.4f}\n")inverse of log is exponential
# maternal_age = np.exp(log_maternal_ages)
maternal_age = inv_log(log_maternal_ages)
# print(maternal_age)
plt.hist(maternal_age, bins=50)(array([4.9000e+01, 3.3100e+02, 1.1980e+03, 2.8390e+03, 4.9030e+03,
7.1890e+03, 8.9480e+03, 9.9030e+03, 1.0371e+04, 9.6740e+03,
8.7290e+03, 7.6180e+03, 6.3390e+03, 5.1430e+03, 3.9980e+03,
3.2360e+03, 2.4350e+03, 1.8400e+03, 1.3560e+03, 1.0230e+03,
7.7300e+02, 5.7800e+02, 4.2300e+02, 3.1200e+02, 2.3000e+02,
1.6100e+02, 1.1900e+02, 8.4000e+01, 5.3000e+01, 3.5000e+01,
3.4000e+01, 1.8000e+01, 1.9000e+01, 9.0000e+00, 6.0000e+00,
5.0000e+00, 7.0000e+00, 6.0000e+00, 2.0000e+00, 1.0000e+00,
0.0000e+00, 1.0000e+00, 0.0000e+00, 0.0000e+00, 1.0000e+00,
0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 1.0000e+00]),
array([ 7.78217839, 9.76342391, 11.74466944, 13.72591496,
15.70716048, 17.688406 , 19.66965152, 21.65089704,
23.63214256, 25.61338808, 27.5946336 , 29.57587913,
31.55712465, 33.53837017, 35.51961569, 37.50086121,
39.48210673, 41.46335225, 43.44459777, 45.4258433 ,
47.40708882, 49.38833434, 51.36957986, 53.35082538,
55.3320709 , 57.31331642, 59.29456194, 61.27580746,
63.25705299, 65.23829851, 67.21954403, 69.20078955,
71.18203507, 73.16328059, 75.14452611, 77.12577163,
79.10701716, 81.08826268, 83.0695082 , 85.05075372,
87.03199924, 89.01324476, 90.99449028, 92.9757358 ,
94.95698133, 96.93822685, 98.91947237, 100.90071789,
102.88196341, 104.86320893, 106.84445445]),
<BarContainer object of 50 artists>)

summary_stats(log_maternal_ages)
# summary_stats(np.exp(log_maternal_ages))
summary_stats(inv_log(log_maternal_ages))X ~ N(μ, σ^2): N(1.4229, 0.1326)
Range: 0.8911 ≤ X ≤ 2.0288
median (M): 1.4226
X ~ N(μ, σ^2): N(27.7423, 8.6749)
Range: 7.7822 ≤ X ≤ 106.8445
median (M): 26.4590
Log normal distribution is defined as: \[ Y = ln(X), Y \sim N(\mu, \sigma) \]
In other words, where the log of the random variable Y is normally distributed.
For a log normally distributed random variable X. The median for X is just the exponential of its mean. \[ M = e^\mu => ln(M) = \mu \]
# Example
avg = log(0.49)
avgnp.float64(-0.3098039199714863)
# Calculate mean of log10 values for all markers for both down's samples and healthy samples
mean_down = log([m.median_mom_down for m in markers]) # Mean for all markers for down's patients
# For example:
# Median MoM value of Papp-a for Down's is 0.49
# Given the above relationship between mean and median for a log-normally distributed variable,
# we can state that the Mean of Papp-a MoM values for Down's is log10(0.49) = -0.31
mean_control = log([m.median_mom_control for m in markers]) # Mean for all markers for healthy patients
mean_down, mean_control(array([ 0.23044892, -0.30980392, 0.24054925]),
array([0.00432137, 0. , 0.00432137]))
Covariance Matrices
Assuming that \(X_i\) for all \(i=0...n\) are independent random variables, the \(Cov(X_i, Y_j) = 0\) and hence we get the following diagnol form containing only the variances.
\[ Cov(X, X) = \begin{bmatrix} Var(X_1) & 0 & 0\\ 0 & Var(X_2) & 0\\ 0 & 0 & Var(X_3) \end{bmatrix} = \begin{bmatrix} \sigma^2(X_1) & 0 & 0\\ 0 & \sigma^2(X_2) & 0\\ 0 & 0 & \sigma^2(X_3) \end{bmatrix} = diag(\sigma^2(X_1), \sigma^2(X_2), \sigma^2(X_3)) \]
The covariance matrix is a diagonal matrix containing only the variances of each of the markers.
Concretely, \[ Cov(X, X) = \begin{bmatrix} Var(Pappa) & 0 & 0\\ 0 & Var(\beta hCG) & 0\\ 0 & 0 & Var(NT) \end{bmatrix} = \begin{bmatrix} \sigma^2(Pappa) & 0 & 0\\ 0 & \sigma^2(\beta hCG) & 0\\ 0 & 0 & \sigma^2(NT) \end{bmatrix} = diag(\sigma^2(Pappa), \sigma^2(\beta hCG), \sigma^2(NT)) \]
variance_matrix_down = np.diag([m.log_sd_down**2 for m in markers])
variance_matrix_control = np.diag([m.log_sd_control**2 for m in markers])
variance_matrix_down, variance_matrix_control(array([[0.0784, 0. , 0. ],
[0. , 0.0961, 0. ],
[0. , 0. , 0.0529]]),
array([[0.0729, 0. , 0. ],
[0. , 0.0625, 0. ],
[0. , 0. , 0.0169]]))
sd_matrix_down = np.sqrt(variance_matrix_down)
sd_matrix_control = np.sqrt(variance_matrix_control)
sd_matrix_down, sd_matrix_control(array([[0.28, 0. , 0. ],
[0. , 0.31, 0. ],
[0. , 0. , 0.23]]),
array([[0.27, 0. , 0. ],
[0. , 0.25, 0. ],
[0. , 0. , 0.13]]))
np.sqrt(0.0784)np.float64(0.27999999999999997)
Correlation Matrices
[m.name for m in markers]['Free B-hCG', 'PAPP-A', 'NT']
\[ \begin{bmatrix} Free B-hCG / Free B-hCG & Free B-hCG / PAPP-A & Free B-hCG / NT\\ PAPP-A / Free B-hCG & PAPP-A / PAPP-A & PAPP-A / NT\\ NT / Free B-hCG & NT / PAPP-A & NT / NT\\ \end{bmatrix} \]
Note: Correlations of markers with NT is assumed to be zero as per the paper (table 3)
# Correlation between different markers for Down's samples
correlation_matrix_down = np.array(
[[1., 0.191, 0.],
[0.191, 1., 0.],
[0., 0., 1.]]
)# Correlation between different markers for Healthy samples
correlation_matrix_control = np.array(
[[1., 0.186, 0.],
[0.186, 1., 0.],
[0., 0., 1.]]
)sd_matrix_down.shape, correlation_matrix_down.shape((3, 3), (3, 3))
\[ Cov(X) = \sqrt{Cov(X)} Corr(X) \sqrt{Cov(X)} \]
# Covariance matrix of all markers for Down's patients
cov_down = sd_matrix_down @ correlation_matrix_down @ sd_matrix_down
cov_downarray([[0.0784 , 0.0165788, 0. ],
[0.0165788, 0.0961 , 0. ],
[0. , 0. , 0.0529 ]])
# Covariance matrix of all markers for Healthy patients
cov_control = sd_matrix_control @ correlation_matrix_control @ sd_matrix_control
cov_controlarray([[0.0729 , 0.012555, 0. ],
[0.012555, 0.0625 , 0. ],
[0. , 0. , 0.0169 ]])
Generate Marker Values for all patient samples
# Sample the marker values for each of the down's patients
log_marker_values_down = np.random.multivariate_normal(
mean_down, # Mean of all markers of all markers for Down's patients
cov_down, # Covariance matrix of all markers for Down's patients
np.sum(has_down) # We want to sample these marker values for ALL the down's samples only
)
log_marker_values_down.shape #, log_marker_values_down(143, 3)
# Sample the marker values for each of the healthy patients
log_marker_values_control = np.random.multivariate_normal(
mean_control, # Mean of all markers of all markers for healthy patients
cov_control, # Covariance matrix of all markers for Healthy patients
np.sum(~has_down) # We want to sample these marker values for ALL the healthy samples only
)
log_marker_values_control.shape #, log_marker_values_control(99857, 3)
Now, lets put all the marker values together into a single big matrix
log_marker_values = np.zeros((sample_size, len(markers)))
log_marker_values.shape(100000, 3)
log_marker_values[has_down] = log_marker_values_down
log_marker_values[~has_down] = log_marker_values_controllog_marker_valuesarray([[-0.07171681, -0.3299774 , 0.05611652],
[-0.3196752 , 0.53406245, -0.10086142],
[-0.16968009, 0.28278058, -0.19569757],
...,
[ 0.05791486, 0.06330189, -0.154301 ],
[ 0.28925172, -0.02463672, 0.19783116],
[-0.09302042, 0.06682716, -0.1293075 ]])
Convert all the marker values from log(MoM) to MoM values
# marker_values = np.exp(log_marker_values)
marker_values = inv_log(log_marker_values)
marker_valuesarray([[0.84778005, 0.46775948, 1.13793254],
[0.47898819, 3.42028619, 0.79275425],
[0.67658117, 1.91769963, 0.63723912],
...,
[1.14265431, 1.15691616, 0.70096931],
[1.94648795, 0.94485091, 1.57699805],
[0.80719708, 1.16634535, 0.74249324]])
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.scatter(
marker_values[:,0],
marker_values[:,1],
# marker_values[:,2],
maternal_age,
c=~has_down
)
import plotly.graph_objects as go
import plotly.io as pio
import numpy as np
def create_interactive_plot(marker_values, maternal_age, is_down, markers, z_axis='maternal_age'):
# pio.renderers.default = "browser"
# Separate Down syndrome and control cases
down_indices = np.where(is_down)[0]
control_indices = np.where(~is_down)[0]
# Determine z-axis values
if z_axis == 'maternal_age':
z_values = maternal_age
z_axis_title = 'Maternal Age'
elif z_axis == 'NT':
z_values = marker_values[:, 2]
z_axis_title = markers[2].name
else:
raise ValueError("z_axis must be either 'maternal_age' or 'NT'")
# Create scatter plots for Down syndrome and control cases separately
scatter_down = go.Scatter3d(
x=marker_values[down_indices, 0],
y=marker_values[down_indices, 1],
z=z_values[down_indices],
mode='markers',
marker=dict(
size=5,
color='red',
symbol='diamond',
),
name='Down Syndrome',
text=[f"Age: {age:.1f}, {z_axis_title}: {z:.2f}, Down Syndrome: True"
for age, z in zip(maternal_age[down_indices], z_values[down_indices])],
hoverinfo="text"
)
scatter_control = go.Scatter3d(
x=marker_values[control_indices, 0],
y=marker_values[control_indices, 1],
z=z_values[control_indices],
mode='markers',
marker=dict(
size=3,
color='blue',
opacity=0.1,
),
name='Control',
text=[f"Age: {age:.1f}, {z_axis_title}: {z:.2f}, Down Syndrome: False"
for age, z in zip(maternal_age[control_indices], z_values[control_indices])],
hoverinfo="text"
)
# Create the layout
layout = go.Layout(
scene=dict(
xaxis_title=markers[0].name,
yaxis_title=markers[1].name,
zaxis_title=z_axis_title,
),
title="Down Syndrome Screening Markers",
width=900,
height=700,
legend=dict(
yanchor="top",
y=0.99,
xanchor="left",
x=0.01
)
)
# Create the figure and show it
fig = go.Figure(data=[scatter_control, scatter_down], layout=layout)
fig.show()create_interactive_plot(marker_values, maternal_age, has_down, markers, z_axis='NT')